First check-in
Focus on Mexico
mex <- ind %>% filter(CountryName == "Mexico")
mex_ <- mex %>% group_by(IndicatorCode) %>%
summarise(
n = n(),
CountryName,
Year,
IndicatorCode,
IndicatorName = str_replace_all(IndicatorName, " ", "_"),
Value
) %>% filter(n > 50) %>% ungroup()
## `summarise()` has grouped output by 'IndicatorCode'. You can override using the `.groups` argument.
Focus on United States
mex <- ind %>% filter(CountryName == "United States")
mex_ <- mex %>% group_by(IndicatorCode) %>%
summarise(
n = n(),
CountryName,
Year,
IndicatorCode,
IndicatorName = str_replace_all(IndicatorName, " ", "_"),
Value
) %>% filter(n > 50) %>% ungroup()
## `summarise()` has grouped output by 'IndicatorCode'. You can override using the `.groups` argument.
mex2 <- mex_ %>% select(CountryName, Year, IndicatorCode, Value) %>%
pivot_wider(
names_from = IndicatorCode,
values_from = Value,
values_fill = 0
)
mex2
Correlation Matrix
# mex3 <- mex2 %>% select(-c(CountryName, Year, IndicatorCode)) %>%
mex3 <- mex2 %>% select(-c(CountryName, Year)) %>%
cor() %>%
round(digits = 2) %>%
data.frame()
mex3
code_to_name <- function(code) {
uh <- ind %>% select(IndicatorName, IndicatorCode) %>% filter(IndicatorCode == code) %>% unique()
uh$IndicatorName
}
# code_to_name("AG.LND.ARBL.HA.PC")
# code_to_name("EN.URB.MCTY")
# code_to_name("NE.IMP.GNFS.ZS")
# code_to_name("NY.GSR.NFCY.CD")
# code_to_name("NE.TRD.GNFS.ZS")
# code_to_name("SP.POP.TOTL")
# code_to_name("TG.VAL.TOTL.GD.ZS")
# code_to_name("NY.TAX.NIND.CN")
# code_to_name("EN.POP.DNST")
# code_to_name("FI.RES.TOTL.CD")
# code_to_name("SP.POP.65UP.TO.ZS")
# code_to_name("NY.GDP.DEFL.ZS")
# code_to_name("NY.GDP.MKTP.KN")
# code_to_name("NY.GNP.MKTP.CN")
v <- c("AG.LND.ARBL.HA.PC", "EN.URB.MCTY", "NE.IMP.GNFS.ZS", "NY.GSR.NFCY.CD", "NE.TRD.GNFS.ZS", "SP.POP.TOTL", "TG.VAL.TOTL.GD.ZS", "NY.TAX.NIND.CN", "EN.POP.DNST", "FI.RES.TOTL.CD", "SP.POP.65UP.TO.ZS")
Correlation Matrix
mex2_ <- mex2_ %>% select(AG.LND.ARBL.HA.PC,
EN.URB.MCTY,
NE.IMP.GNFS.ZS,
NY.GSR.NFCY.CD,
NE.TRD.GNFS.ZS,
SP.POP.TOTL,
TG.VAL.TOTL.GD.ZS,
# NY.TAX.NIND.CN,
EN.POP.DNST,
FI.RES.TOTL.CD,
SP.POP.65UP.TO.ZS)
# corrplot.mixed(
# round(cor(mex2_), 2),
# lower = "number",
# upper = "color"
# )
cor(mex2_) %>% corrplot(diag = FALSE, order = 'AOE')

# code_to_name("AG.LND.ARBL.HA.PC")
# code_to_name("EN.URB.MCTY")
# code_to_name("NE.IMP.GNFS.ZS")
# code_to_name("NY.GSR.NFCY.CD")
# code_to_name("NE.TRD.GNFS.ZS")
# code_to_name("SP.POP.TOTL")
# code_to_name("TG.VAL.TOTL.GD.ZS")
# code_to_name("NY.TAX.NIND.CN")
# code_to_name("EN.POP.DNST")
# code_to_name("FI.RES.TOTL.CD")
# code_to_name("SP.POP.65UP.TO.ZS")
# corrplot.mixed(
# round(cor(mex2_), 2),
# lower = "number",
# upper = "color"
# )
| AG.LND.ARBL.HA.PC |
Arable land (hectares per person) |
| EN.URB.MCTY |
Population in urban agglomerations of more than 1 million |
| NE.IMP.GNFS.ZS |
Imports of goods and services (% of GDP) |
| NY.GSR.NFCY.CD |
Net income from abroad (current US\() | | NE.TRD.GNFS.ZS | Trade (% of GDP) | | SP.POP.TOTL | Population, total | | TG.VAL.TOTL.GD.ZS | Merchandise trade (% of GDP) | | NY.TAX.NIND.CN | Net taxes on products (current LCU) | | EN.POP.DNST | Population density (people per sq. km of land area) | | FI.RES.TOTL.CD | Total reserves (includes gold, current US\)) |
| SP.POP.65UP.TO.ZS |
Population ages 65 and above (% of total) |
Data sparsity exploration
Testing Number of Values
# ind %>% group_by(CountryName) %>%
# summarise(
# n_ind_many_year = group_by(IndicatorName) %>%
# summarise(
#
# )
# )
ind %>% group_by(CountryName) %>%
summarise(
n = n(),
n_year = Year %>% unique() %>% length(),
n_ind = IndicatorName %>% unique() %>% length()
) %>% arrange(n, n_ind)
ind %>% group_by(CountryName) %>%
summarise(
n = n(),
n_year = Year %>% unique() %>% length(),
n_ind = IndicatorName %>% unique() %>% length()
) %>% arrange(CountryName)
What Indicators are in ALL conutries?
# what indicators are in ALL countries?
foo <- ind %>% group_by(IndicatorName) %>%
summarise(
n = n(),
n_count = CountryName %>% unique() %>% length()
) %>% arrange(-n_count) %>%
select(n_count, n, IndicatorName)
foo
foo %>% nrow()
## [1] 1344
ind %>% group_by(IndicatorName) %>%
summarise(
n_val = Value %>% length()
) %>% arrange(-n_val)
foo %>% filter( n_count > 200)
ind
ind %>%
filter(CountryName=="United States" & IndicatorCode=="SP.ADO.TFRT") %>%
ggplot(aes(x=Year, y=Value)) +
geom_point()
